library(gapminder)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(knitr)

Part 1: Factor management

summary(gapminder)
##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
## 

A better structure of the gapminder dataset

gapminder %>%
  group_by(continent) %>%
  knitr::kable() %>%
  head()
## [1] "country                    continent    year    lifeExp          pop     gdpPercap"
## [2] "-------------------------  ----------  -----  ---------  -----------  ------------"
## [3] "Afghanistan                Asia         1952   28.80100      8425333      779.4453"
## [4] "Afghanistan                Asia         1957   30.33200      9240934      820.8530"
## [5] "Afghanistan                Asia         1962   31.99700     10267083      853.1007"
## [6] "Afghanistan                Asia         1967   34.02000     11537966      836.1971"
gapminder %>%
summarize(
n_row = nrow(gapminder),
cont_levels = nlevels(gapminder$country),
coun_levels = nlevels(gapminder$continent)) %>%
knitr::kable(col.names = c("Rows", "Countries", "Continents"))
Rows Countries Continents
1704 142 5
gapminder %>%
  filter(continent == "Asia") %>%
  group_by(country) %>%
  select(country, gdpPercap) %>%
  knitr::kable(col.names = c("Country", "GDP Percap")) %>%
  head(10)
##  [1] "Country                GDP Percap"
##  [2] "-------------------  ------------"
##  [3] "Afghanistan              779.4453"
##  [4] "Afghanistan              820.8530"
##  [5] "Afghanistan              853.1007"
##  [6] "Afghanistan              836.1971"
##  [7] "Afghanistan              739.9811"
##  [8] "Afghanistan              786.1134"
##  [9] "Afghanistan              978.0114"
## [10] "Afghanistan              852.3959"

Using the “arrange” function

gapminder %>%
  filter(continent == "Asia") %>%
  select(country, gdpPercap, year) %>%
  mutate(max_GDP = max(gdpPercap)) %>%
  arrange(max_GDP) %>%
  knitr::kable() %>%
  head(20)
##  [1] "country                 gdpPercap   year    max_GDP"
##  [2] "-------------------  ------------  -----  ---------"
##  [3] "Afghanistan              779.4453   1952   113523.1"
##  [4] "Afghanistan              820.8530   1957   113523.1"
##  [5] "Afghanistan              853.1007   1962   113523.1"
##  [6] "Afghanistan              836.1971   1967   113523.1"
##  [7] "Afghanistan              739.9811   1972   113523.1"
##  [8] "Afghanistan              786.1134   1977   113523.1"
##  [9] "Afghanistan              978.0114   1982   113523.1"
## [10] "Afghanistan              852.3959   1987   113523.1"
## [11] "Afghanistan              649.3414   1992   113523.1"
## [12] "Afghanistan              635.3414   1997   113523.1"
## [13] "Afghanistan              726.7341   2002   113523.1"
## [14] "Afghanistan              974.5803   2007   113523.1"
## [15] "Bahrain                 9867.0848   1952   113523.1"
## [16] "Bahrain                11635.7995   1957   113523.1"
## [17] "Bahrain                12753.2751   1962   113523.1"
## [18] "Bahrain                14804.6727   1967   113523.1"
## [19] "Bahrain                18268.6584   1972   113523.1"
## [20] "Bahrain                19340.1020   1977   113523.1"

This shows the max gdpPercap by an Asian country in a certain year.

visualisation using ggplot

arranged_gap <- gapminder %>%
  filter(continent == "Asia") %>%
  select(country, gdpPercap, year) %>%
  mutate(max_GDP = max(gdpPercap)) %>%
  arrange(max_GDP)
arranged_gap %>%
  ggplot(aes(max_GDP, year, colour = country)) +
  geom_point()

this is weird. according to this graph, Yemen had the max gdp over the years.

arranged_gap %>%
  filter(country == "Bangladesh") %>%
  ggplot(aes(max_GDP, year)) +
  geom_point()

ok. dont know whats going on with GDP. Let me try with lifeExp.

gapminder %>%
  filter(continent == "Asia") %>%
  group_by(country) %>%
  select(country, lifeExp) %>%
  knitr::kable(col.names = c("Country", "Life Expectancy")) %>%
  head(10)
##  [1] "Country               Life Expectancy"
##  [2] "-------------------  ----------------"
##  [3] "Afghanistan                  28.80100"
##  [4] "Afghanistan                  30.33200"
##  [5] "Afghanistan                  31.99700"
##  [6] "Afghanistan                  34.02000"
##  [7] "Afghanistan                  36.08800"
##  [8] "Afghanistan                  38.43800"
##  [9] "Afghanistan                  39.85400"
## [10] "Afghanistan                  40.82200"
nlevels(gapminder$lifeExp)
## [1] 0
nlevels(gapminder$gdpPercap)
## [1] 0
nlevels(gapminder$pop) 
## [1] 0
nlevels(gapminder$country)  
## [1] 142
nlevels(gapminder$continent)
## [1] 5
nlevels(gapminder$year)
## [1] 0
arranged_gap <- gapminder %>%
  filter(continent == "Asia") %>%
  select(country, lifeExp, year) %>%
  mutate(max_LEx = max(lifeExp)) %>%
  arrange(max_LEx)
gapminder %>%
  filter(year > 1987) %>%
  filter(continent == "Asia") %>%
  select(country, year, lifeExp)
## # A tibble: 132 x 3
##    country      year lifeExp
##    <fct>       <int>   <dbl>
##  1 Afghanistan  1992    41.7
##  2 Afghanistan  1997    41.8
##  3 Afghanistan  2002    42.1
##  4 Afghanistan  2007    43.8
##  5 Bahrain      1992    72.6
##  6 Bahrain      1997    73.9
##  7 Bahrain      2002    74.8
##  8 Bahrain      2007    75.6
##  9 Bangladesh   1992    56.0
## 10 Bangladesh   1997    59.4
## # ... with 122 more rows
arranged_lifeEx <- gapminder %>%
  filter(year > 1987) %>%
  filter(continent == "Asia") %>%
  select(country, year, lifeExp)
arranged_lifeEx %>%
  ggplot(aes(lifeExp, year, colour = country)) +
  geom_point()

Dropping Oceania now.

gapminder %>%
  filter(continent != "Oceania") %>%
  droplevels()
## # A tibble: 1,680 x 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # ... with 1,670 more rows
levels(gapminder$continent)
## [1] "Africa"   "Americas" "Asia"     "Europe"   "Oceania"

Checking for the continents after dropping Oceania.

noOc_gap <- gapminder %>%
  filter(continent != "Oceania") %>%
  droplevels()
levels(noOc_gap$continent)
## [1] "Africa"   "Americas" "Asia"     "Europe"

Plotting for lifeExp after dropping Oceania

noOc_gap %>%
  ggplot(aes(continent, lifeExp, colour = continent)) +
  geom_jitter() +
  labs (x = "Continent",
        y = "Life Expectancy",
        title = "Life expectancy after dropping Oceania") +
  theme_bw()

Part 2: File input and output (File I/O)

arranged_lifeEx <- gapminder %>%
  filter(year > 1987) %>%
  filter(continent == "Asia") %>%
  select(country, year, lifeExp)

Trying to save the file as csv. as instructed in the assigment.

write.csv(arranged_lifeEx, file = "STAT545_hw05")

Trying to import using read.csv as instructed

read_csv("STAT545_hw05")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   country = col_character(),
##   year = col_integer(),
##   lifeExp = col_double()
## )
## # A tibble: 132 x 4
##       X1 country      year lifeExp
##    <int> <chr>       <int>   <dbl>
##  1     1 Afghanistan  1992    41.7
##  2     2 Afghanistan  1997    41.8
##  3     3 Afghanistan  2002    42.1
##  4     4 Afghanistan  2007    43.8
##  5     5 Bahrain      1992    72.6
##  6     6 Bahrain      1997    73.9
##  7     7 Bahrain      2002    74.8
##  8     8 Bahrain      2007    75.6
##  9     9 Bangladesh   1992    56.0
## 10    10 Bangladesh   1997    59.4
## # ... with 122 more rows

Part 3: Visualisation design

library(plotly)
noOc_gap %>%
  ggplot(aes(lifeExp, year)) +
  geom_point(aes(colour = pop)) +
  facet_wrap(~ continent) +
     theme_bw() +
  labs(x= "Life Expectancy", y = "Year") +
    theme(axis.text = element_text(size = 8),
          axis.title = element_text(size = 20),
          strip.background = element_rect(fill = "purple"))

this graph shows that Asia had very high population (light blue) at some points, and its position with respect to Year and Life expectance.

plot_ly(noOc_gap, 
        x = ~lifeExp, 
        y = ~year, 
        z = ~pop,
        type = "scatter",
        mode = "markers",
        opacity = 0.5) 
## Warning: 'scatter' objects don't have these attributes: 'z'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'selectedpoints', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'text', 'hovertext', 'mode', 'hoveron', 'line', 'connectgaps', 'cliponaxis', 'fill', 'fillcolor', 'marker', 'selected', 'unselected', 'textposition', 'textfont', 'r', 't', 'error_x', 'error_y', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'hovertextsrc', 'textpositionsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'

I would like to try the 3D version.

plot_ly(noOc_gap, 
        x = ~lifeExp, 
        y = ~year, 
        z = ~pop,
        type = "scatter3d",
        mode = "markers",
        opacity = 0.5) 
# "3D" did not work. try "3d". case sensitive. work eith extending at the edges to enlarge or shrink with trackpad.

Part 4: Writing figures to file

hw05_fig <- noOc_gap %>%
  ggplot(aes(lifeExp, year)) +
  geom_point(aes(colour = pop)) +
  facet_wrap(~ continent) +
     theme_bw() +
  labs(x= "Life Expectancy", y = "Year") +
    theme(axis.text = element_text(size = 8),
          axis.title = element_text(size = 20),
          strip.background = element_rect(fill = "purple"))
ggsave("hw05_plot.png", hw05_fig)
## Saving 7 x 5 in image

changing the scaling of the figure

ggsave("hw05_plot2.png", hw05_fig, width = 50, height = 40, units = "cm", dpi = 600)

hw05_plot is better looking than hw05_plot2